home *** CD-ROM | disk | FTP | other *** search
/ Personal Computer World 2006 December / PCWDEC06.iso / Software / Trial / Paint Shop Pro XI / Data1.cab / markupbase.py.0160FC08_F3D9_4869_9D41_C611C16F42D5 < prev    next >
Encoding:
Text File  |  2005-06-08  |  14.1 KB  |  384 lines

"""Shared support for scanning document type declarations in HTML and XHTML."""
  2.  
  3. import re
  4.  
  5. _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
  6. _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
  7. _commentclose = re.compile(r'--\s*>')
  8. _markedsectionclose = re.compile(r']\s*]\s*>')
  9.  
  10. # An analysis of the MS-Word extensions is available at
  11. # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
  12.  
  13. _msmarkedsectionclose = re.compile(r']\s*>')
  14.  
  15. del re
  16.  
  17.  
  18. class ParserBase:
  19.     """Parser base class which provides some common support methods used
  20.     by the SGML/HTML and XHTML parsers."""
  21.  
  22.     def __init__(self):
  23.         if self.__class__ is ParserBase:
  24.             raise RuntimeError(
  25.                 "markupbase.ParserBase must be subclassed")
  26.  
  27.     def error(self, message):
  28.         raise NotImplementedError(
  29.             "subclasses of ParserBase must override error()")
  30.  
  31.     def reset(self):
  32.         self.lineno = 1
  33.         self.offset = 0
  34.  
  35.     def getpos(self):
  36.         """Return current line number and offset."""
  37.         return self.lineno, self.offset
  38.  
  39.     # Internal -- update line number and offset.  This should be
  40.     # called for each piece of data exactly once, in order -- in other
  41.     # words the concatenation of all the input strings to this
  42.     # function should be exactly the entire input.
  43.     def updatepos(self, i, j):
  44.         if i >= j:
  45.             return j
  46.         rawdata = self.rawdata
  47.         nlines = rawdata.count("\n", i, j)
  48.         if nlines:
  49.             self.lineno = self.lineno + nlines
  50.             pos = rawdata.rindex("\n", i, j) # Should not fail
  51.             self.offset = j-(pos+1)
  52.         else:
  53.             self.offset = self.offset + j-i
  54.         return j
  55.  
  56.     _decl_otherchars = ''
  57.  
  58.     # Internal -- parse declaration (for use by subclasses).
  59.     def parse_declaration(self, i):
  60.         # This is some sort of declaration; in "HTML as
  61.         # deployed," this should only be the document type
  62.         # declaration ("<!DOCTYPE html...>").
  63.         # ISO 8879:1986, however, has more complex
  64.         # declaration syntax for elements in <!...>, including:
  65.         # --comment--
  66.         # [marked section]
  67.         # name in the following list: ENTITY, DOCTYPE, ELEMENT,
  68.         # ATTLIST, NOTATION, SHORTREF, USEMAP,
  69.         # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
  70.         rawdata = self.rawdata
  71.         j = i + 2
  72.         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
  73.         if rawdata[j:j+1] in ("-", ""):
  74.             # Start of comment followed by buffer boundary,
  75.             # or just a buffer boundary.
  76.             return -1
  77.         # A simple, practical version could look like: ((name|stringlit) S*) + '>'
  78.         n = len(rawdata)
  79.         if rawdata[j:j+1] == '--': #comment
  80.             # Locate --.*-- as the body of the comment
  81.             return self.parse_comment(i)
  82.         elif rawdata[j] == '[': #marked section
  83.             # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
  84.             # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
  85.             # Note that this is extended by Microsoft Office "Save as Web" function
  86.             # to include [if...] and [endif].
  87.             return self.parse_marked_section(i)
  88.         else: #all other declaration elements
  89.             decltype, j = self._scan_name(j, i)
  90.         if j < 0:
  91.             return j
  92.         if decltype == "doctype":
  93.             self._decl_otherchars = ''
  94.         while j < n:
  95.             c = rawdata[j]
  96.             if c == ">":
  97.                 # end of declaration syntax
  98.                 data = rawdata[i+2:j]
  99.                 if decltype == "doctype":
  100.                     self.handle_decl(data)
  101.                 else:
  102.                     self.unknown_decl(data)
  103.                 return j + 1
  104.             if c in "\"'":
  105.                 m = _declstringlit_match(rawdata, j)
  106.                 if not m:
  107.                     return -1 # incomplete
  108.                 j = m.end()
  109.             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
  110.                 name, j = self._scan_name(j, i)
  111.             elif c in self._decl_otherchars:
  112.                 j = j + 1
  113.             elif c == "[":
  114.                 # this could be handled in a separate doctype parser
  115.                 if decltype == "doctype":
  116.                     j = self._parse_doctype_subset(j + 1, i)
  117.                 elif decltype in ("attlist", "linktype", "link", "element"):
  118.                     # must tolerate []'d groups in a content model in an element declaration
  119.                     # also in data attribute specifications of attlist declaration
  120.                     # also link type declaration subsets in linktype declarations
  121.                     # also link attribute specification lists in link declarations
  122.                     self.error("unsupported '[' char in %s declaration" % decltype)
  123.                 else:
  124.                     self.error("unexpected '[' char in declaration")
  125.             else:
  126.                 self.error(
  127.                     "unexpected %r char in declaration" % rawdata[j])
  128.             if j < 0:
  129.                 return j
  130.         return -1 # incomplete
  131.  
  132.     # Internal -- parse a marked section
  133.     # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
  134.     def parse_marked_section( self, i, report=1 ):
  135.         rawdata= self.rawdata
  136.         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
  137.         sectName, j = self._scan_name( i+3, i )
  138.         if j < 0:
  139.             return j
  140.         if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
  141.             # look for standard ]]> ending
  142.             match= _markedsectionclose.search(rawdata, i+3)
  143.         elif sectName in ("if", "else", "endif"):
  144.             # look for MS Office ]> ending
  145.             match= _msmarkedsectionclose.search(rawdata, i+3)
  146.         else:
  147.             self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
  148.         if not match:
  149.             return -1
  150.         if report:
  151.             j = match.start(0)
  152.             self.unknown_decl(rawdata[i+3: j])
  153.         return match.end(0)
  154.  
  155.     # Internal -- parse comment, return length or -1 if not terminated
  156.     def parse_comment(self, i, report=1):
  157.         rawdata = self.rawdata
  158.         if rawdata[i:i+4] != '<!--':
  159.             self.error('unexpected call to parse_comment()')
  160.         match = _commentclose.search(rawdata, i+4)
  161.         if not match:
  162.             return -1
  163.         if report:
  164.             j = match.start(0)
  165.             self.handle_comment(rawdata[i+4: j])
  166.         return match.end(0)
  167.  
  168.     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
  169.     # returning the index just past any whitespace following the trailing ']'.
  170.     def _parse_doctype_subset(self, i, declstartpos):
  171.         rawdata = self.rawdata
  172.         n = len(rawdata)
  173.         j = i
  174.         while j < n:
  175.             c = rawdata[j]
  176.             if c == "<":
  177.                 s = rawdata[j:j+2]
  178.                 if s == "<":
  179.                     # end of buffer; incomplete
  180.                     return -1
  181.                 if s != "<!":
  182.                     self.updatepos(declstartpos, j + 1)
  183.                     self.error("unexpected char in internal subset (in %r)" % s)
  184.                 if (j + 2) == n:
  185.                     # end of buffer; incomplete
  186.                     return -1
  187.                 if (j + 4) > n:
  188.                     # end of buffer; incomplete
  189.                     return -1
  190.                 if rawdata[j:j+4] == "<!--":
  191.                     j = self.parse_comment(j, report=0)
  192.                     if j < 0:
  193.                         return j
  194.                     continue
  195.                 name, j = self._scan_name(j + 2, declstartpos)
  196.                 if j == -1:
  197.                     return -1
  198.                 if name not in ("attlist", "element", "entity", "notation"):
  199.                     self.updatepos(declstartpos, j + 2)
  200.                     self.error(
  201.                         "unknown declaration %r in internal subset" % name)
  202.                 # handle the individual names
  203.                 meth = getattr(self, "_parse_doctype_" + name)
  204.                 j = meth(j, declstartpos)
  205.                 if j < 0:
  206.                     return j
  207.             elif c == "%":
  208.                 # parameter entity reference
  209.                 if (j + 1) == n:
  210.                     # end of buffer; incomplete
  211.                     return -1
  212.                 s, j = self._scan_name(j + 1, declstartpos)
  213.                 if j < 0:
  214.                     return j
  215.                 if rawdata[j] == ";":
  216.                     j = j + 1
  217.             elif c == "]":
  218.                 j = j + 1
  219.                 while j < n and rawdata[j].isspace():
  220.                     j = j + 1
  221.                 if j < n:
  222.                     if rawdata[j] == ">":
  223.                         return j
  224.                     self.updatepos(declstartpos, j)
  225.                     self.error("unexpected char after internal subset")
  226.                 else:
  227.                     return -1
  228.             elif c.isspace():
  229.                 j = j + 1
  230.             else:
  231.                 self.updatepos(declstartpos, j)
  232.                 self.error("unexpected char %r in internal subset" % c)
  233.         # end of buffer reached
  234.         return -1
  235.  
  236.     # Internal -- scan past <!ELEMENT declarations
  237.     def _parse_doctype_element(self, i, declstartpos):
  238.         name, j = self._scan_name(i, declstartpos)
  239.         if j == -1:
  240.             return -1
  241.         # style content model; just skip until '>'
  242.         rawdata = self.rawdata
  243.         if '>' in rawdata[j:]:
  244.             return rawdata.find(">", j) + 1
  245.         return -1
  246.  
  247.     # Internal -- scan past <!ATTLIST declarations
  248.     def _parse_doctype_attlist(self, i, declstartpos):
  249.         rawdata = self.rawdata
  250.         name, j = self._scan_name(i, declstartpos)
  251.         c = rawdata[j:j+1]
  252.         if c == "":
  253.             return -1
  254.         if c == ">":
  255.             return j + 1
  256.         while 1:
  257.             # scan a series of attribute descriptions; simplified:
  258.             #   name type [value] [#constraint]
  259.             name, j = self._scan_name(j, declstartpos)
  260.             if j < 0:
  261.                 return j
  262.             c = rawdata[j:j+1]
  263.             if c == "":
  264.                 return -1
  265.             if c == "(":
  266.                 # an enumerated type; look for ')'
  267.                 if ")" in rawdata[j:]:
  268.                     j = rawdata.find(")", j) + 1
  269.                 else:
  270.                     return -1
  271.                 while rawdata[j:j+1].isspace():
  272.                     j = j + 1
  273.                 if not rawdata[j:]:
  274.                     # end of buffer, incomplete
  275.                     return -1
  276.             else:
  277.                 name, j = self._scan_name(j, declstartpos)
  278.             c = rawdata[j:j+1]
  279.             if not c:
  280.                 return -1
  281.             if c in "'\"":
  282.                 m = _declstringlit_match(rawdata, j)
  283.                 if m:
  284.                     j = m.end()
  285.                 else:
  286.                     return -1
  287.                 c = rawdata[j:j+1]
  288.                 if not c:
  289.                     return -1
  290.             if c == "#":
  291.                 if rawdata[j:] == "#":
  292.                     # end of buffer
  293.                     return -1
  294.                 name, j = self._scan_name(j + 1, declstartpos)
  295.                 if j < 0:
  296.                     return j
  297.                 c = rawdata[j:j+1]
  298.                 if not c:
  299.                     return -1
  300.             if c == '>':
  301.                 # all done
  302.                 return j + 1
  303.  
  304.     # Internal -- scan past <!NOTATION declarations
  305.     def _parse_doctype_notation(self, i, declstartpos):
  306.         name, j = self._scan_name(i, declstartpos)
  307.         if j < 0:
  308.             return j
  309.         rawdata = self.rawdata
  310.         while 1:
  311.             c = rawdata[j:j+1]
  312.             if not c:
  313.                 # end of buffer; incomplete
  314.                 return -1
  315.             if c == '>':
  316.                 return j + 1
  317.             if c in "'\"":
  318.                 m = _declstringlit_match(rawdata, j)
  319.                 if not m:
  320.                     return -1
  321.                 j = m.end()
  322.             else:
  323.                 name, j = self._scan_name(j, declstartpos)
  324.                 if j < 0:
  325.                     return j
  326.  
  327.     # Internal -- scan past <!ENTITY declarations
  328.     def _parse_doctype_entity(self, i, declstartpos):
  329.         rawdata = self.rawdata
  330.         if rawdata[i:i+1] == "%":
  331.             j = i + 1
  332.             while 1:
  333.                 c = rawdata[j:j+1]
  334.                 if not c:
  335.                     return -1
  336.                 if c.isspace():
  337.                     j = j + 1
  338.                 else:
  339.                     break
  340.         else:
  341.             j = i
  342.         name, j = self._scan_name(j, declstartpos)
  343.         if j < 0:
  344.             return j
  345.         while 1:
  346.             c = self.rawdata[j:j+1]
  347.             if not c:
  348.                 return -1
  349.             if c in "'\"":
  350.                 m = _declstringlit_match(rawdata, j)
  351.                 if m:
  352.                     j = m.end()
  353.                 else:
  354.                     return -1    # incomplete
  355.             elif c == ">":
  356.                 return j + 1
  357.             else:
  358.                 name, j = self._scan_name(j, declstartpos)
  359.                 if j < 0:
  360.                     return j
  361.  
  362.     # Internal -- scan a name token and the new position and the token, or
  363.     # return -1 if we've reached the end of the buffer.
  364.     def _scan_name(self, i, declstartpos):
  365.         rawdata = self.rawdata
  366.         n = len(rawdata)
  367.         if i == n:
  368.             return None, -1
  369.         m = _declname_match(rawdata, i)
  370.         if m:
  371.             s = m.group()
  372.             name = s.strip()
  373.             if (i + len(s)) == n:
  374.                 return None, -1  # end of buffer
  375.             return name.lower(), m.end()
  376.         else:
  377.             self.updatepos(declstartpos, i)
  378.             self.error("expected name token at %r"
  379.                        % rawdata[declstartpos:declstartpos+20])
  380.  
  381.     # To be overridden -- handlers for unknown objects
  382.     def unknown_decl(self, data):
  383.         pass
  384.